{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import lightgbm as lgb\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train= pd.read_csv('../../Large_output/train_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_engineering(df):\n",
    "    \n",
    "    # Sort by localtime\n",
    "    df.sort_values(\"local_time\")\n",
    "    df.reset_index(drop=True)\n",
    "    \n",
    "    # Add more features\n",
    "    df[\"local_time\"] = pd.to_datetime(df[\"local_time\"],format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    df[\"hour\"] = df[\"local_time\"].dt.hour\n",
    "    df[\"weekend\"] = df[\"local_time\"].dt.weekday\n",
    "    df['square_feet'] =  np.log1p(df['square_feet'])\n",
    "    \n",
    "    \n",
    "    # Encode Categorical Data\n",
    "    le = LabelEncoder()\n",
    "    df[\"primary_use\"] = le.fit_transform(df[\"primary_use\"])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2271.93 MB\n",
      "Memory usage after optimization is: 568.74 MB\n",
      "Decreased by 75.0%\n"
     ]
    }
   ],
   "source": [
    "df_train = reduce_mem_usage(df_train,use_float16=True)\n",
    "df_train.loc[:,'timestamp1']=df_train.timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.loc[:,'timestamp']=df_train.timestamp1\n",
    "df_train.timestamp=pd.to_datetime(df_train.timestamp, format='%Y-%m-%d %H:%M:%S')\n",
    "df_train.timestamp = (df_train.timestamp - pd.to_datetime(\"2016-01-01\")).dt.total_seconds() // 3600"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):\n",
    "    \"\"\"Helper routine for 'find_bad_zeros'.\n",
    "    \n",
    "    This operates upon a single dataframe produced by 'groupby'. We expect an \n",
    "    additional column 'meter_id' which is a duplicate of 'meter' because groupby \n",
    "    eliminates the original one.\"\"\"\n",
    "    meter = Xy_subset.meter_id.iloc[0]\n",
    "    is_zero = Xy_subset.meter_reading == 0\n",
    "    if meter == 0:\n",
    "        # Electrical meters should never be zero. Keep all zero-readings in this table so that\n",
    "        # they will all be dropped in the train set.\n",
    "        return is_zero\n",
    "\n",
    "    transitions = (is_zero != is_zero.shift(1))\n",
    "    all_sequence_ids = transitions.cumsum()\n",
    "    ids = all_sequence_ids[is_zero].rename(\"ids\")\n",
    "    if meter in [2, 3]:\n",
    "        # It's normal for steam and hotwater to be turned off during the summer\n",
    "        keep = set(ids[(Xy_subset.timestamp < summer_start) |\n",
    "                       (Xy_subset.timestamp > summer_end)].unique())\n",
    "        is_bad = ids.isin(keep) & (ids.map(ids.value_counts()) >= min_interval)\n",
    "    elif meter == 1:\n",
    "        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index(\"timestamp\").ids\n",
    "        is_bad = ids.map(ids.value_counts()) >= min_interval\n",
    "\n",
    "        # Cold water may be turned off during the winter\n",
    "        jan_id = time_ids.get(0, False)\n",
    "        dec_id = time_ids.get(8283, False)\n",
    "        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and\n",
    "                dec_id == time_ids.get(8783, False)):\n",
    "            is_bad = is_bad & (~(ids.isin(set([jan_id, dec_id]))))\n",
    "    else:\n",
    "        raise Exception(f\"Unexpected meter type: {meter}\")\n",
    "\n",
    "    result = is_zero.copy()\n",
    "    result.update(is_bad)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_clean = pd.read_csv('../../Large_output/train_clean_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2638.86 MB\n",
      "Memory usage after optimization is: 733.78 MB\n",
      "Decreased by 72.2%\n"
     ]
    }
   ],
   "source": [
    "df_train_clean = reduce_mem_usage(df_train_clean,use_float16=True)\n",
    "train_engineer = features_engineering(df_train_clean)\n",
    "train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\\\n",
    "=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)\n",
    "target = np.log1p(train_engineer[\"meter_reading\"])\n",
    "features = train_engineer[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    \"objective\": \"regression\",\n",
    "    \"boosting\": \"gbdt\",\n",
    "    \"num_leaves\": 1803,\n",
    "    \"learning_rate\": 0.05,\n",
    "    \"colsample_bytree\": 0.9,\n",
    "    \"reg_lambda\": 0.1,\n",
    "    'reg_alpha':0.1,\n",
    "    \"metric\": \"rmse\",\n",
    "    'max_bins':600,\n",
    "    'max_depth':-1,\n",
    "    'min_child_sample':20,\n",
    "    'min_child_weight':30,\n",
    "    'min_split_gain':0.1,\n",
    "    'subsample': 0.9,\n",
    "    'subsample_freq':1,\n",
    "     \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = [\"building_id\", \"site_id\", \"meter\", \n",
    "                        \"primary_use\",  \"weekend\",'is_holiday']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "building_id          0\n",
       "meter                0\n",
       "site_id              0\n",
       "primary_use          0\n",
       "square_feet          0\n",
       "air_temperature      0\n",
       "cloud_coverage       0\n",
       "dew_temperature      0\n",
       "precip_depth_1_hr    0\n",
       "hour                 0\n",
       "weekend              0\n",
       "is_holiday           0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/lightgbm/basic.py:1243: UserWarning: Using categorical_feature in Dataset.\n",
      "  warnings.warn('Using categorical_feature in Dataset.')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 50 rounds\n",
      "[25]\ttraining's rmse: 0.923136\tvalid_1's rmse: 0.999472\n",
      "[50]\ttraining's rmse: 0.72767\tvalid_1's rmse: 0.860393\n",
      "[75]\ttraining's rmse: 0.675437\tvalid_1's rmse: 0.840509\n",
      "[100]\ttraining's rmse: 0.646649\tvalid_1's rmse: 0.836474\n",
      "[125]\ttraining's rmse: 0.626953\tvalid_1's rmse: 0.836071\n",
      "[150]\ttraining's rmse: 0.611737\tvalid_1's rmse: 0.837473\n",
      "Early stopping, best iteration is:\n",
      "[116]\ttraining's rmse: 0.633365\tvalid_1's rmse: 0.835784\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "[25]\ttraining's rmse: 0.920024\tvalid_1's rmse: 1.0214\n",
      "[50]\ttraining's rmse: 0.729434\tvalid_1's rmse: 0.859795\n",
      "[75]\ttraining's rmse: 0.679488\tvalid_1's rmse: 0.824596\n",
      "[100]\ttraining's rmse: 0.653321\tvalid_1's rmse: 0.811392\n",
      "[125]\ttraining's rmse: 0.63395\tvalid_1's rmse: 0.80658\n",
      "[150]\ttraining's rmse: 0.618901\tvalid_1's rmse: 0.805244\n",
      "[175]\ttraining's rmse: 0.609641\tvalid_1's rmse: 0.80511\n",
      "[200]\ttraining's rmse: 0.602338\tvalid_1's rmse: 0.805402\n",
      "Early stopping, best iteration is:\n",
      "[156]\ttraining's rmse: 0.616069\tvalid_1's rmse: 0.804922\n",
      "Training until validation scores don't improve for 50 rounds\n",
      "[25]\ttraining's rmse: 0.908754\tvalid_1's rmse: 1.02179\n",
      "[50]\ttraining's rmse: 0.705374\tvalid_1's rmse: 0.885728\n",
      "[75]\ttraining's rmse: 0.650905\tvalid_1's rmse: 0.867435\n",
      "[100]\ttraining's rmse: 0.619927\tvalid_1's rmse: 0.864945\n",
      "[125]\ttraining's rmse: 0.599729\tvalid_1's rmse: 0.86607\n",
      "[150]\ttraining's rmse: 0.584207\tvalid_1's rmse: 0.867747\n",
      "Early stopping, best iteration is:\n",
      "[102]\ttraining's rmse: 0.618417\tvalid_1's rmse: 0.864741\n"
     ]
    }
   ],
   "source": [
    "kf = KFold(n_splits=3)\n",
    "models = []\n",
    "for train_index,test_index in kf.split(features):\n",
    "    train_features = features.loc[train_index]\n",
    "    train_target = target.loc[train_index]\n",
    "    \n",
    "    test_features = features.loc[test_index]\n",
    "    test_target = target.loc[test_index]\n",
    "    \n",
    "    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)\n",
    "    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)\n",
    "    \n",
    "    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)\n",
    "    models.append(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature = pd.read_csv('../../Large_output/test_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature = features_engineering(test_feature)\n",
    "row_ids = test_feature[['row_id']]\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday', 'row_id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "results = []\n",
    "for model in models:\n",
    "    if  results == []:\n",
    "        results = np.expm1(model.predict(test_feature, num_iteration=model.best_iteration)) / len(models)\n",
    "    else:\n",
    "        results += np.expm1(model.predict(test_feature, num_iteration=model.best_iteration)) / len(models)\n",
    "    del model\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature['meter_reading']=np.clip(results, 0, a_max=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row_id</th>\n",
       "      <th>meter_reading</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>134.285627</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>69.056423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>10.880237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>251.449750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1112.319660</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697595</th>\n",
       "      <td>41697595</td>\n",
       "      <td>6.082773</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697596</th>\n",
       "      <td>41697596</td>\n",
       "      <td>4.195885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697597</th>\n",
       "      <td>41697597</td>\n",
       "      <td>6.979827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697598</th>\n",
       "      <td>41697598</td>\n",
       "      <td>181.797817</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697599</th>\n",
       "      <td>41697599</td>\n",
       "      <td>3.910399</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41697600 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            row_id  meter_reading\n",
       "0                0     134.285627\n",
       "1                1      69.056423\n",
       "2                2      10.880237\n",
       "3                3     251.449750\n",
       "4                4    1112.319660\n",
       "...            ...            ...\n",
       "41697595  41697595       6.082773\n",
       "41697596  41697596       4.195885\n",
       "41697597  41697597       6.979827\n",
       "41697598  41697598     181.797817\n",
       "41697599  41697599       3.910399\n",
       "\n",
       "[41697600 rows x 2 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_feature.loc[(test_feature['site_id']==0) & \n",
    "                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &\n",
    "                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)\n",
    "df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})\n",
    "df_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result.to_csv('../../Large_output/lgb3_cleaned.csv', index = False)\n",
    "# 1.081"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
